/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.db;

import java.io.*;
import java.util.*;
import java.util.logging.*;

import net.nutch.io.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/********************************************
 * A BucketSet holds many buckets, full of instructions.
 *
 * Once created, it can be given a set of byte arrays.  They
 * will be divided into a handy number of buckets.  The BucketSet
 * will look at a certain bit range within the item to decide
 * which bucket will get the item.  Ideally, your input data
 * will show an even distribution in the bit range that you
 * indicate.
 *
 * Once you call close(), the BucketSet cannot be read from
 * or written to.  You must reopen it by allocating a new BucketSet
 * (see loadBuckets()).  close() leaves the on-disk data in place.
 *
 * Once you request a single item via getNextItem(), you cannot
 * again add items to the BucketSet.  You can start reading from
 * the beginning of the BucketSet again by recreating it.
 *
 * If you are completely done with the BucketSet, delete() it.
 *
 * On-disk layout, all inside the bucket directory:
 *   "config"    -- keyStartByte, keyBits, curBucket (ints) and
 *                  insertsAllowed (boolean)
 *   "bucket.N"  -- a sequence of (int length, byte[length]) records
 *                  for bucket N; the file is absent if the bucket
 *                  never received an item
 *
 * @author Mike Cafarella
 ********************************************/
class BucketSet {
    final static String BUCKET_FILENAME = "bucket";
    final static String CONFIG_FILENAME = "config";
    final static int INTEGER_SIZE = 32;

    /**
     * Largest usable value of keyBits.  The bucket count is 2^keyBits
     * and must be a positive int that can size an array, so 31 and 32
     * bits cannot be supported even though an int has INTEGER_SIZE bits.
     */
    final static int MAX_KEY_BITS = INTEGER_SIZE - 2;

    /**
     * Create a new BucketSet from a data set that is already there.
     * You can start adding to the BucketSet again, as long as
     * you have not yet made a call to getNextItem().
     *
     * @param bucketsDir directory of an existing BucketSet
     * @return the reopened BucketSet
     * @throws IOException if the directory is missing or its config
     *         file cannot be read or holds invalid values
     */
    public static BucketSet loadBuckets(File bucketsDir) throws IOException {
        return new BucketSet(bucketsDir);
    }

    /**
     * Create a brand-new BucketSet, at the given File location,
     * with the appropriate parameters.  If the file already exists,
     * then this will fail.
     *
     * @param bucketsDir   directory to create; must not exist yet
     * @param keyStartByte index of the first item byte that holds key bits
     * @param keyBits      number of key bits used to choose a bucket
     *                     (0..MAX_KEY_BITS); there will be 2^keyBits buckets
     * @return the new, empty BucketSet
     * @throws IOException if a parameter is out of range, the directory
     *         already exists, or it cannot be created
     */
    public static BucketSet createBuckets(File bucketsDir, int keyStartByte, int keyBits) throws IOException {
        return new BucketSet(bucketsDir, keyStartByte, keyBits);
    }

    // Persistent values (mirrored in the config file)
    File bucketsDir;
    int keyStartByte, keyBits, curBucket;
    boolean insertsAllowed;

    // Transient values
    DataInputStream inStreams[];
    DataOutputStream outStreams[];
    int numBuckets;
    boolean closed;

    /**
     * Load an old BucketSet, at bucketsDir.
     *
     * @throws IOException if bucketsDir is not an existing directory or
     *         the stored configuration is unreadable or invalid
     */
    BucketSet(File bucketsDir) throws IOException {
        if (! (bucketsDir.exists() && bucketsDir.isDirectory())) {
            throw new IOException("File " + bucketsDir + " either does not exist or is not a directory");
        }
        this.bucketsDir = bucketsDir;
        loadConfig();

        // numBuckets is not persisted; it must be rederived from keyBits,
        // otherwise the stream arrays below would have length zero.
        this.numBuckets = bucketCountFor(keyStartByte, keyBits);

        if (insertsAllowed) {
            this.outStreams = new DataOutputStream[numBuckets];
        } else {
            this.inStreams = new DataInputStream[numBuckets];
        }
        this.closed = false;
    }

    /**
     * Create a new BucketSet at bucketsDir, with the given parameters.
     * The number of buckets is determined by how many bits of the
     * key you allocate toward bucket-selection.
     *
     * @throws IOException if a parameter is out of range, bucketsDir
     *         already exists, or it cannot be created
     */
    BucketSet(File bucketsDir, int keyStartByte, int keyBits) throws IOException {
        // Validate before touching the disk so a bad call leaves no debris.
        int bucketCount = bucketCountFor(keyStartByte, keyBits);

        if (bucketsDir.exists()) {
            throw new IOException("Directory " + bucketsDir + " is already present.");
        }
        if (! bucketsDir.mkdir()) {
            throw new IOException("Could not create directory " + bucketsDir);
        }
        this.bucketsDir = bucketsDir;

        // Persistent values
        this.keyStartByte = keyStartByte;
        this.keyBits = keyBits;
        this.curBucket = 0;
        this.insertsAllowed = true;
        storeConfig();

        // Transient ones.  A fresh set always starts in insert mode.
        this.numBuckets = bucketCount;
        this.outStreams = new DataOutputStream[numBuckets];
        this.closed = false;
    }

    /**
     * Close down the BucketSet.  No more reading or writing.
     *
     * Every open stream is closed even if an earlier one fails; the
     * first failure is rethrown afterwards.  The data on disk is left
     * untouched, so the set can be reopened with loadBuckets().
     *
     * @throws IOException if the BucketSet is already closed, or if
     *         closing one of the streams failed
     */
    public void close() throws IOException {
        if (closed) {
            throw new IOException("BucketSet closed");
        }

        IOException firstFailure = null;
        for (int i = 0; i < numBuckets; i++) {
            try {
                if (insertsAllowed) {
                    if (outStreams[i] != null) {
                        outStreams[i].close();
                    }
                } else {
                    if (inStreams[i] != null) {
                        inStreams[i].close();
                    }
                }
            } catch (IOException ioe) {
                if (firstFailure == null) {
                    firstFailure = ioe;
                }
            }
        }

        inStreams = null;
        outStreams = null;
        closed = true;

        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Get rid of the BucketSet on disk: all bucket files, the config
     * file, and the directory itself.  If the BucketSet is still open
     * it is closed first.
     *
     * @throws IOException if closing fails or a file or the directory
     *         cannot be removed
     */
    public void delete() throws IOException {
        if (! closed) {
            close();
        }

        for (int i = 0; i < numBuckets; i++) {
            removeIfPresent(bucketFile(i));
        }
        removeIfPresent(new File(bucketsDir, CONFIG_FILENAME));
        removeIfPresent(bucketsDir);
    }

    /**
     * Write the given byte array to the BucketSet.
     *
     * Use the array's key region to decide which bucket
     * will get it.  The item must be long enough to contain
     * the whole key region.
     *
     * @param item the bytes to store
     * @throws IOException if the BucketSet is closed, reading has
     *         already started, or the write fails
     */
    public void storeItem(byte item[]) throws IOException {
        if (closed) {
            throw new IOException("BucketSet closed");
        }
        if (! insertsAllowed) {
            throw new IOException("Insert no longer allowed to this BucketSet");
        }

        // Before we can store the item, we first need to
        // compute which bucket to use
        int bucket = computeBucket(item);
        if (outStreams[bucket] == null) {
            // Append, so that items stored before a close()/loadBuckets()
            // cycle are kept rather than truncated away.
            outStreams[bucket] = new DataOutputStream(new FileOutputStream(bucketFile(bucket), true));
        }

        // Record format: length prefix, then the raw bytes.
        outStreams[bucket].writeInt(item.length);
        outStreams[bucket].write(item, 0, item.length);
    }

    /**
     * Return the next item in the current bucket.
     *
     * If we're at the end of a bucket, jump silently to the next.
     *
     * If we're at the end of all buckets, return null.
     *
     * The first call switches the BucketSet permanently from insert
     * mode to read mode and records that in the config file.
     *
     * @return the next stored item, or null when every bucket is exhausted
     * @throws IOException if the BucketSet is closed, a bucket file is
     *         corrupt, or reading fails
     */
    public byte[] getNextItem() throws IOException {
        //
        // Phase 0.  Check to make sure it's OK.
        //

        // Make sure we're not closed
        if (closed) {
            throw new IOException("BucketSet closed");
        }

        // If this is the first call to getNextItem(), we need to prepare!
        if (insertsAllowed) {
            // Close down all outstreams, open instreams
            for (int i = 0; i < outStreams.length; i++) {
                if (outStreams[i] != null) {
                    outStreams[i].close();
                }
                outStreams[i] = null;
            }
            inStreams = new DataInputStream[numBuckets];
            outStreams = null;
            insertsAllowed = false;
            curBucket = 0;
            storeConfig();
        }

        //
        // Phase 1.  Find the right bucket
        //

        // Move through all buckets till we find something to read
        int bucket;
        int itemLen = 0;
        boolean found = false;
        for (bucket = curBucket; bucket < numBuckets; bucket++) {
            // First, open stream if necessary
            if (inStreams[bucket] == null) {
                File file = bucketFile(bucket);
                if (! file.exists()) {
                    // Nothing was ever stored in this bucket.
                    continue;
                }
                inStreams[bucket] = new DataInputStream(new FileInputStream(file));
            }

            // Second, read from stream how many bytes are in item.
            // If we hit the end of the stream, we continue to the
            // next one.
            if (inStreams[bucket].available() == 0) {
                inStreams[bucket].close();
                inStreams[bucket] = null;
            } else {
                itemLen = inStreams[bucket].readInt();
                found = true;
                break;
            }
        }

        //
        // Phase 2.  Remember the bucket, and read the next item
        //

        // Remember where we stopped
        curBucket = bucket;

        // We have no more buckets!
        if (! found) {
            return null;
        }

        // A negative length can only come from a damaged file; report it
        // instead of pretending the whole set has been consumed.
        if (itemLen < 0) {
            throw new IOException("Corrupt bucket file " + bucketFile(bucket) + ": negative item length " + itemLen);
        }

        byte newItem[] = new byte[itemLen];
        inStreams[bucket].readFully(newItem);
        return newItem;
    }

    /**
     * Compute which bucket to use, given the input data
     * (and configuration params).  Return bucket index.
     *
     * The index is formed from keyBits consecutive bits of the item,
     * starting at the most significant bit of byte keyStartByte and
     * read most-significant-bit first.
     *
     * @param item the item being stored; must contain the key region
     * @return a value in [0, 2^keyBits)
     */
    int computeBucket(byte item[]) {
        int bucketIndex = 0;

        for (int i = 0; i < keyBits; i++) {
            byte curByte = item[keyStartByte + (i / 8)];
            // The mask discards the sign extension from widening the byte.
            int curBit = (curByte >> (7 - (i % 8))) & 0x01;
            bucketIndex = (bucketIndex << 1) | curBit;
        }
        return bucketIndex;
    }

    /**
     * Load the BucketSet config file
     *
     * @throws IOException if the file is missing or too short
     */
    void loadConfig() throws IOException {
        File configFile = new File(bucketsDir, CONFIG_FILENAME);
        DataInputStream dis = new DataInputStream(new FileInputStream(configFile));
        try {
            this.keyStartByte = dis.readInt();
            this.keyBits = dis.readInt();
            this.curBucket = dis.readInt();
            this.insertsAllowed = dis.readBoolean();
        } finally {
            dis.close();
        }
    }

    /**
     * Store values out to the BucketSet config file
     *
     * @throws IOException if the file cannot be written
     */
    void storeConfig() throws IOException {
        File configFile = new File(bucketsDir, CONFIG_FILENAME);
        DataOutputStream dos = new DataOutputStream(new FileOutputStream(configFile));
        try {
            dos.writeInt(keyStartByte);
            dos.writeInt(keyBits);
            dos.writeInt(curBucket);
            dos.writeBoolean(insertsAllowed);
        } finally {
            dos.close();
        }
    }

    /**
     * Validate the key parameters and return the bucket count, 2^keyBits.
     */
    private static int bucketCountFor(int keyStartByte, int keyBits) throws IOException {
        if (keyStartByte < 0) {
            throw new IOException("Parameter keyStartByte is negative: " + keyStartByte);
        }
        if (keyBits < 0) {
            throw new IOException("Parameter keyBits is negative: " + keyBits);
        }
        if (keyBits > MAX_KEY_BITS) {
            throw new IOException("Parameter keyBits is too large: " + keyBits);
        }
        return 1 << keyBits;
    }

    /**
     * Return the file that backs the bucket with the given index.
     */
    private File bucketFile(int bucket) {
        return new File(bucketsDir, BUCKET_FILENAME + "." + bucket);
    }

    /**
     * Delete the given file or empty directory if it exists, failing loudly.
     */
    private static void removeIfPresent(File target) throws IOException {
        if (target.exists() && ! target.delete()) {
            throw new IOException("Could not delete " + target);
        }
    }
}